library(nycflights13)
library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
flights_sml <- select(flights, year:day, ends_with(“delay”), distance, air_time ) mutate(flights_sml, gain = dep_delay - arr_delay, speed = distance / air_time * 60 )
mutate(flights_sml, gain = dep_delay - arr_delay, hours = air_time / 60, gain_per_hour = gain / hours )
transmute(flights, dep_time, hour = dep_time %/% 100, minute = dep_time %% 100 )
flights
1).
1504 %/% 100
## [1] 15
1504 %% 100
## [1] 4
1504 %/% 100 * 60 + 1504 %% 100
## [1] 904
flights_times <- mutate(flights, dep_time_mins = (dep_time %/% 100 * 60 + dep_time %% 100) %% 1440, sched_dep_time_mins = (sched_dep_time %/% 100 * 60 + sched_dep_time %% 100) %% 1440 ) # view only relevant columns select( flights_times, dep_time, dep_time_mins, sched_dep_time, sched_dep_time_mins )
We could define a function time2mins(), which converts a vector of times in from the format used in flights to minutes since midnight.
2).
air_time is the difference between the arrival (arr_time) and departure times (dep_time). In other words, air_time = arr_time - dep_time.
The flight passes midnight, so arr_time < dep_time. In these cases, the difference in airtime should be by 24 hours (1,440 minutes).
The flight crosses time zones, and the total air time will be off by hours (multiples of 60).
Given the time-zones in the US, the differences due to time-zone should be 60 minutes (Central) 120 minutes (Mountain), 180 minutes (Pacific), 240 minutes (Alaska), or 300 minutes (Hawaii).
3). the departure delay (dep_delay) to be equal to the difference between scheduled departure time (sched_dep_time), and actual departure time (dep_time), dep_time - sched_dep_time = dep_delay.
4).
flights_delayed <- mutate(flights,
dep_delay_min_rank = min_rank(desc(dep_delay)),
dep_delay_row_number = row_number(desc(dep_delay)),
dep_delay_dense_rank = dense_rank(desc(dep_delay))
)
flights_delayed <- filter(
flights_delayed,
!(dep_delay_min_rank > 10 | dep_delay_row_number > 10 |
dep_delay_dense_rank > 10)
)
flights_delayed <- arrange(flights_delayed, dep_delay_min_rank)
print(select(
flights_delayed, month, day, carrier, flight, dep_delay,
dep_delay_min_rank, dep_delay_row_number, dep_delay_dense_rank
),
n = Inf
)
## # A tibble: 10 x 8
## month day carrier flight dep_delay dep_delay_min_r… dep_delay_row_n…
## <int> <int> <chr> <int> <dbl> <int> <int>
## 1 1 9 HA 51 1301 1 1
## 2 6 15 MQ 3535 1137 2 2
## 3 1 10 MQ 3695 1126 3 3
## 4 9 20 AA 177 1014 4 4
## 5 7 22 MQ 3075 1005 5 5
## 6 4 10 DL 2391 960 6 6
## 7 3 17 DL 2119 911 7 7
## 8 6 27 DL 2007 899 8 8
## 9 7 22 DL 2047 898 9 9
## 10 12 5 AA 172 896 10 10
## # … with 1 more variable: dep_delay_dense_rank <int>
dplyr package provides multiple functions for ranking, which differ in how they handle tied values: row_number(), min_rank(), dense_rank()
row_number() assigns each element a unique value
min_rank() and dense_rank() assign tied values the same rank, but differ in how they assign values to the next rank
5).
c(1 + 1, 2 + 2, 3 + 3, 1 + 4, 2 + 5, 3 + 6, 1 + 7, 2 + 8, 3 + 9, 1 + 10)
## [1] 2 4 6 5 7 9 8 10 12 11
6).
by_dest <- group_by(flights, dest) delay <- summarise(by_dest, count = n(), dist = mean(distance, na.rm = TRUE), delay = mean(arr_delay, na.rm = TRUE) ) delay <- filter(delay, count > 20, dest != “HNL”)
ggplot(data = delay, mapping = aes(x = dist, y = delay)) + geom_point(aes(size = count), alpha = 1/3) + geom_smooth(se = FALSE)
There are three steps to prepare this data:
Group flights by destination.
Summarise to compute distance, average delay, and number of flights.
Filter to remove noisy points and Honolulu airport, which is almost twice as far away as the next closest airport.
flights %>% group_by(year, month, day) %>% summarise(mean = mean(dep_delay))
delays <- not_cancelled %>% group_by(tailnum) %>% summarise( delay = mean(arr_delay) )
ggplot(data = delays, mapping = aes(x = delay)) + geom_freqpoly(binwidth = 10)
delays <- not_cancelled %>% group_by(tailnum) %>% summarise( delay = mean(arr_delay, na.rm = TRUE), n = n() )
ggplot(data = delays, mapping = aes(x = n, y = delay)) + geom_point(alpha = 1/10)
The mean is the sum divided by the length; the median is a value where 50% of x is above it, and 50% is below it.
The interquartile range IQR(x) and median absolute deviation mad(x) are robust equivalents that may be more useful if you have outliers.
To count the number of distinct (unique) values, use n_distinct(x)
This makes sum() and mean() very useful: sum(x) gives the number of TRUEs in x, and mean(x) gives the proportion.
daily <- group_by(flights, year, month, day) (per_day <- summarise(daily, flights = n()))
(per_month <- summarise(per_day, flights = sum(flights)))
(per_year <- summarise(per_month, flights = sum(flights)))
1). If a flight is always 30 minutes late and that delay is known, then it is as if the arrival time is that delayed time.
2). not_cancelled %>% group_by(dest) %>% summarise(n = length(dest))
3).
filter(flights, !is.na(dep_delay), is.na(arr_delay)) %>% select(dep_time, arr_time, sched_arr_time, dep_delay, arr_delay) - most important column is arr_delay, which indicates the amount of delay in arrival.
4). cancelled_and_delays <- flights %>% mutate(cancelled = (is.na(arr_delay) | is.na(dep_delay))) %>% group_by(year, month, day) %>% summarise( cancelled_prop = mean(cancelled), avg_dep_delay = mean(dep_delay, na.rm = TRUE), avg_arr_delay = mean(arr_delay, na.rm = TRUE) ) %>%
ggplot(cancelled_and_delays) + geom_point(aes(x = avg_arr_delay, y = cancelled_prop))
5).
flights %>%
group_by(carrier) %>%
summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(arr_delay))
6). The sort argument to count() sorts the results in order of n. You could use this anytime you would run count() followed by arrange()
1).
2).
flights %>%
filter(!is.na(tailnum)) %>%
mutate(on_time = !is.na(arr_time) & (arr_delay <= 0)) %>%
group_by(tailnum) %>%
summarise(on_time = mean(on_time), n = n()) %>%
filter(min_rank(on_time) == 1)
3).
flights %>%
group_by(hour) %>%
summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(arr_delay)
4).
cancelled_and_delays <-
flights %>%
mutate(cancelled = (is.na(arr_delay) | is.na(dep_delay))) %>%
group_by(year, month, day) %>%
summarise(
cancelled_prop = mean(cancelled),
avg_dep_delay = mean(dep_delay, na.rm = TRUE),
avg_arr_delay = mean(arr_delay, na.rm = TRUE)
) %>%
ungroup()
flights %>%
filter(arr_delay > 0) %>%
group_by(dest) %>%
mutate(
arr_delay_total = sum(arr_delay),
arr_delay_prop = arr_delay / arr_delay_total
) %>%
select(
dest, month, day, dep_time, carrier, flight,
arr_delay, arr_delay_prop
) %>%
arrange(dest, desc(arr_delay_prop))
5). After about 8-hours, a delayed flight is likely to be followed by a flight leaving on time.
lagged_delays <- flights %>%
arrange(origin, month, day, dep_time) %>%
group_by(origin) %>%
mutate(dep_delay_lag = lag(dep_delay)) %>%
filter(!is.na(dep_delay), !is.na(dep_delay_lag))
lagged_delays %>%
group_by(dep_delay_lag) %>%
summarise(dep_delay_mean = mean(dep_delay)) %>%
ggplot(aes(y = dep_delay_mean, x = dep_delay_lag)) +
geom_point() +
scale_x_continuous(breaks = seq(0, 1500, by = 120)) +
labs(y = "Departure Delay", x = "Previous Departure Delay")
lagged_delays %>%
group_by(origin, dep_delay_lag) %>%
summarise(dep_delay_mean = mean(dep_delay)) %>%
ggplot(aes(y = dep_delay_mean, x = dep_delay_lag)) +
geom_point() +
facet_wrap(~origin, ncol = 1) +
labs(y = "Departure Delay", x = "Previous Departure Delay")
flights %>%
mutate(mph = distance / (air_time / 60)) %>%
arrange(desc(mph)) %>%
select(
origin, dest, mph, year, month, day, dep_time, flight, carrier,
dep_delay, arr_delay
)
7).
flights %>%
# find all airports with > 1 carrier
group_by(dest) %>%
mutate(n_carriers = n_distinct(carrier)) %>%
filter(n_carriers > 1) %>%
# rank carriers by numer of destinations
group_by(carrier) %>%
summarize(n_dest = n_distinct(dest)) %>%
arrange(desc(n_dest))
filter(airlines, carrier == "EV")
filter(airlines, carrier %in% c("AS", "F9", "HA"))
8).The exception is flights on the days on which daylight savings started (March 10) or ended (November 3).
flights %>%
select(tailnum, year, month, day, dep_delay) %>%
filter(!is.na(dep_delay)) %>%
arrange(tailnum, year, month, day) %>%
group_by(tailnum) %>%
mutate(cumulative_hr_delays = cumsum(dep_delay > 60)) %>%
summarise(total_flights = sum(cumulative_hr_delays < 1)) %>%
arrange(total_flights)